Originality declaration

I, [Yanpu Huang], confirm that the work presented in this assessment is my own. Where information has been derived from other sources, I confirm that this has been indicated in the work.

date: 20 December, 2021

Start your response here

Initial project scope

  • Find the spatial pattern of adult obesity in London boroughs.
  • Analyse the linear correlation between adult obesity and several food nutrients in each London borough.

Packages

# Load required packages.
# NOTE(review): duplicate library() calls for broom, here and sp removed —
# attaching a package a second time is a no-op, so behaviour is unchanged.
library(tidyverse)
library(tmap)
library(rgdal)
library(broom)
library(mapview)
library(crosstalk)
library(sf)
library(sp)
library(spdep)
library(car)
library(fs)
library(janitor)
library(here)
library(landsat)
library(tidypredict)
library(spatstat)
library(rgeos)
library(maptools)
library(GISTools)  # GISTools attaches RColorBrewer, whose brewer.pal() is used below
library(geojson)
library(geojsonio)
library(tmaptools)
library(spatialreg)
library(spgwr)

Data processing

Read the datasets from “data” file.

# Read London borough boundaries (shapefile), project to British National
# Grid (EPSG:27700), and standardise column names with janitor::clean_names().
LondonBoroughs <- st_read(here::here("data", "statistical-gis-boundaries-london", "ESRI", "London_Borough_Excluding_MHW.shp")) %>%
  st_transform(., 27700) %>%
  clean_names()
## Reading layer `London_Borough_Excluding_MHW' from data source 
##   `/Users/calvinhuangk/Desktop/CASA/modules/GIS/casa0005-final-exam-HuangK1227/data/statistical-gis-boundaries-london/ESRI/London_Borough_Excluding_MHW.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 33 features and 7 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 503568.2 ymin: 155850.8 xmax: 561957.5 ymax: 200933.9
## Projected CRS: OSGB 1936 / British National Grid
# Borough-level adult obesity data for 2012; one row per borough,
# keyed by the local-authority code `oslaua`.
obesity <- read_csv(here::here("data", "london_obesity_borough_2012.csv")) %>%
  clean_names()
## Rows: 33 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): oslaua
## dbl (4): f_healthy_weight, f_overweight, f_obese, weighted_sample
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Borough-level grocery data; keyed by `area_id`, includes per-nutrient
# averages (fat, sugar, ...) and num_transactions used later.
grocery <- read_csv(here::here("data", "year_borough_grocery.csv")) %>%
    clean_names()
## Rows: 33 Columns: 202
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (1): area_id
## dbl (201): weight, weight_perc2.5, weight_perc25, weight_perc50, weight_perc...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Alternative MSOA-level dataset, intentionally left unused:
# msoa <- read_csv(here::here("data", "msoa-data.csv")) %>%
#     clean_names()

# Join obesity data onto the boundaries by GSS code; distinct() guards
# against duplicated boroughs after the merge.
lb_profile1 <- LondonBoroughs%>%
   merge(.,
        obesity, 
        by.x="gss_code", 
        by.y="oslaua",
        no.dups = TRUE)%>%
  distinct(.,gss_code, 
           .keep_all = TRUE)

# Join the grocery data onto the borough profiles by GSS code / area_id and
# derive total nutrient quantities: average nutrient value * number of
# grocery transactions in the borough.
lb_profile2 <- lb_profile1%>%
   merge(.,
        grocery, 
        by.x="gss_code", 
        by.y="area_id",
        no.dups = TRUE)%>%
  distinct(.,gss_code, 
           .keep_all = TRUE) %>%
  # One across() call replaces eight repeated mutate() calls; it creates
  # total_fat, total_carb, ..., total_alcohol in the same order as before.
  mutate(across(c(fat, carb, sugar, salt, protein, saturate, fibre, alcohol),
                ~ .x * num_transactions,
                .names = "total_{.col}"))

# Keep only the identifier, name, obesity rate and nutrient columns
# (sf geometry is sticky, so it is retained automatically).
lb_profile3 <- lb_profile2 %>%
  dplyr::select(gss_code, name, f_obese,
                total_fat, total_carb, total_sugar, total_salt, total_protein,
                total_saturate, total_fibre, total_alcohol,
                fat, sugar, saturate, carb, fibre, protein, salt, alcohol)

Spatial pattern

Density map

Make the basic obesity rate map in London

# Static choropleth of the adult obesity rate per borough.
tmap_mode("plot")
## tmap mode set to plotting
# Jenks natural breaks classification; PuOr diverging palette.
tm_shape(lb_profile3) +
    tm_polygons("f_obese",
        style="jenks",
        palette="PuOr",
        midpoint=NA,
        title="obesity_rate")

We can see that south-west London has a relatively high obesity rate.

Spatial auto-correlation(global)

Make the coordinates data prepared for Moran’s I test and other tests, to see the spatial auto-correlation of obesity rate

# Borough centroids, used as point locations for neighbour plots.
coords_lb <- lb_profile3%>%
  st_centroid()%>%
  st_geometry()
plot(coords_lb,axes=TRUE)

# Queen-contiguity neighbour list: boroughs sharing any boundary point.
lb_nb <- lb_profile3 %>%
  poly2nb(., queen=T)

summary(lb_nb)
## Neighbour list object:
## Number of regions: 33 
## Number of nonzero links: 136 
## Percentage nonzero weights: 12.48852 
## Average number of links: 4.121212 
## Link number distribution:
## 
##  2  3  4  5  6  7 
##  2 10  9  7  4  1 
## 2 least connected regions:
## 4 16 with 2 links
## 1 most connected region:
## 5 with 7 links
#plot the neighbour links
plot(lb_nb, st_geometry(coords_lb), col="red")
#add a map underneath
plot(lb_profile3$geometry, add=T)

# Binary (style="B") weights matrix, used only to inspect link counts.
lb.lw <- lb_nb %>%
  nb2mat(., style="B")

sum(lb.lw)
## [1] 136
sum(lb.lw[,1])
## [1] 5
# NOTE(review): lb.lw is reassigned here — the binary matrix above is
# replaced by a globally-standardised (style="C") listw object, which is
# what the global tests below actually use.
lb.lw <- lb_nb %>%
  nb2listw(., style="C")
lb.lw
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 33 
## Number of nonzero links: 136 
## Percentage nonzero weights: 12.48852 
## Average number of links: 4.121212 
## 
## Weights style: C 
## Weights constants summary:
##    n   nn S0       S1       S2
## C 33 1089 33 16.01471 143.6613

Do the several spatial model tests

# Global Moran's I on the obesity rate: tests for overall spatial clustering.
i_lb_global_obesity <- lb_profile3 %>%
  pull(f_obese) %>%
  as.vector()%>%
  moran.test(., lb.lw)

i_lb_global_obesity
## 
##  Moran I test under randomisation
## 
## data:  .  
## weights: lb.lw    
## 
## Moran I statistic standard deviate = 0.73232, p-value = 0.232
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##        0.05038149       -0.03125000        0.01242566
# Global Geary's C: sensitive to local differences between neighbours
# (C < 1 indicates positive autocorrelation).
c_lb_global_obesity <- lb_profile3 %>%
  pull(f_obese) %>%
  as.vector()%>%
  geary.test(., lb.lw)

c_lb_global_obesity
## 
##  Geary C test under randomisation
## 
## data:  . 
## weights: lb.lw 
## 
## Geary C statistic standard deviate = 1.1834, p-value = 0.1183
## alternative hypothesis: Expectation greater than statistic
## sample estimates:
## Geary C statistic       Expectation          Variance 
##         0.8433236         1.0000000         0.0175270
# Getis-Ord global G: distinguishes clustering of high vs low values.
g_lb_global_obesity <-  lb_profile3 %>%
  pull(f_obese) %>%
  as.vector()%>%
  globalG.test(., lb.lw)

g_lb_global_obesity
## 
##  Getis-Ord global G statistic
## 
## data:  . 
## weights: lb.lw 
## 
## standard deviate = -1.0102, p-value = 0.8438
## alternative hypothesis: greater
## sample estimates:
## Global G statistic        Expectation           Variance 
##       3.040045e-02       3.125000e-02       7.072561e-07
  • The Moran’s I: statistic > 0, positive auto-correlation
  • The Geary’s C: statistic < 1, positive auto-correlation
  • The General G: statistic < expected, but very close, suggesting at most a weak tendency for low values to cluster.

So we could say the spatial auto-correlation of obesity rate in London is very slightly positive.

Then focus on local moran

# Local Moran's I per borough: Ii statistic, expectation, variance,
# z-score and p-value for each of the 33 boroughs.
I_lb_Local_obesity <- lb_profile3 %>%
  pull(f_obese) %>%
  as.vector()%>%
  localmoran(., lb.lw)%>%
  as_tibble()


slice_head(I_lb_Local_obesity, n=5)
## # A tibble: 5 × 5
##   Ii           E.Ii          Var.Ii      Z.Ii        `Pr(z != E(Ii))`
##   <localmrn>   <localmrn>    <localmrn>  <localmrn>  <localmrn>      
## 1 -2.030943880 -0.2120364582 1.220328862 -1.64653950 0.09965273      
## 2  0.406556599 -0.1316443244 0.807781023  0.59882182 0.54929171      
## 3  0.001638183 -0.0007847383 0.005469339  0.03276211 0.97386429      
## 4  0.168347659 -0.0064720866 0.049483675  0.78588592 0.43193432      
## 5  0.067983680 -0.0001904077 0.001229428  1.94432019 0.05185685
# Attach the local Moran statistics and z-scores to the spatial data frame
# so they can be mapped (count_* and density_* hold the same values here).
lb_profile3 <- lb_profile3 %>%
  mutate(count_I = as.numeric(I_lb_Local_obesity$Ii))%>%
  mutate(count_Iz =as.numeric(I_lb_Local_obesity$Z.Ii))%>%
  mutate(density_I =as.numeric(I_lb_Local_obesity$Ii))%>%
  mutate(density_Iz =as.numeric(I_lb_Local_obesity$Z.Ii))

Plot the local moran z score map

# Fixed breaks at conventional significance thresholds for z-scores
# (90%, 95%, 99% two-tailed).
breaks1<-c(-1000,-2.58,-1.96,-1.65,1.65,1.96,2.58,1000)
# brewer.pal() is provided by RColorBrewer — presumably attached via
# GISTools; verify it is on the search path. reversed so red = high.
MoranColours<- rev(brewer.pal(8, "RdGy"))

tmap_mode("plot")
## tmap mode set to plotting
tm_shape(lb_profile3) +
    tm_polygons("count_Iz",
        style="fixed",
        breaks=breaks1,
        palette=MoranColours,
        midpoint=NA,
        title="Local Moran's I z score, adult obesity rate in London boroughs")

Plot the local Gi* map

# Local Getis-Ord Gi statistic: identifies hot spots (high-value clusters)
# and cold spots (low-value clusters).
Gi_lb_Local_Obesity <- lb_profile3 %>%
  pull(f_obese) %>%
  as.vector()%>%
  localG(., lb.lw)

head(Gi_lb_Local_Obesity)
## [1] -1.64653950  0.59882182  0.03276211  0.78588592 -1.94432019  0.98516705
lb_profile3 <- lb_profile3 %>%
  mutate(obesity_G = as.numeric(Gi_lb_Local_Obesity))

GIColours<- rev(brewer.pal(8, "RdBu"))

tmap_mode("plot")
## tmap mode set to plotting
tm_shape(lb_profile3) +
    tm_polygons("obesity_G",
        style="fixed",
        breaks=breaks1,
        palette=GIColours,
        midpoint=NA,
        title="local Gi*, Adult obesity in London boroughs ")

Analysis

Now we will carry out the analysis with models and data visualisation, using qplot to see the fitted linear-regression line of the raw data.

# Scatter plots of each total nutrient vs obesity rate, with an OLS fitted
# line (stat_smooth method="lm") and jittered points.
# NOTE(review): qplot() is deprecated in recent ggplot2 — consider
# ggplot(aes(...)) + geom_point() if this is ever re-run on a new setup.
q1 <- qplot(x = `total_fat`, 
           y = `f_obese`, 
           data=lb_profile3)
q1 + stat_smooth(method="lm", se=FALSE, size=1) + 
  geom_jitter()
## `geom_smooth()` using formula 'y ~ x'

q2 <- qplot(x = `total_sugar`, 
           y = `f_obese`, 
           data=lb_profile3)
q2 + stat_smooth(method="lm", se=FALSE, size=1) + 
  geom_jitter()
## `geom_smooth()` using formula 'y ~ x'

q3 <- qplot(x = `total_saturate`, 
           y = `f_obese`, 
           data=lb_profile3)
q3 + stat_smooth(method="lm", se=FALSE, size=1) + 
  geom_jitter()
## `geom_smooth()` using formula 'y ~ x'

q4 <- qplot(x = `total_carb`, 
           y = `f_obese`, 
           data=lb_profile3)
q4 + stat_smooth(method="lm", se=FALSE, size=1) + 
  geom_jitter()
## `geom_smooth()` using formula 'y ~ x'

q5 <- qplot(x = `total_fibre`, 
           y = `f_obese`, 
           data=lb_profile3)
q5 + stat_smooth(method="lm", se=FALSE, size=1) + 
  geom_jitter()
## `geom_smooth()` using formula 'y ~ x'

The distributions do not appear to follow a normal distribution, so we need to check this in the next step.

Data distribution

Make distribution histogram to check if it is follow normal distribution.

#dependent variable
ggplot(lb_profile3, aes(x=f_obese)) + 
  geom_histogram(aes(y = ..density..))+
  geom_density(colour="red",
               size=1,
               adjust=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#independent variables
ggplot(lb_profile3, aes(x=total_fat)) + 
  geom_histogram(aes(y = ..density..))+
  geom_density(colour="red",
               size=1,
               adjust=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(lb_profile3, aes(x=total_sugar)) + 
  geom_histogram(aes(y = ..density..))+
  geom_density(colour="red",
               size=1,
               adjust=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(lb_profile3, aes(x=total_saturate)) + 
  geom_histogram(aes(y = ..density..))+
  geom_density(colour="red",
               size=1,
               adjust=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(lb_profile3, aes(x=total_carb)) + 
  geom_histogram(aes(y = ..density..))+
  geom_density(colour="red",
               size=1,
               adjust=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(lb_profile3, aes(x=total_fibre)) + 
  geom_histogram(aes(y = ..density..))+
  geom_density(colour="red",
               size=1,
               adjust=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(lb_profile3, aes(x=fat)) + 
  geom_histogram(aes(y = ..density..))+
  geom_density(colour="red",
               size=1,
               adjust=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Most of the independent variables do not follow a normal distribution. Apply a log or power transform so that they better approximate a log-normal or power-normal distribution.

Use box-plot to see if we should select log or exponential

# car::symbox() draws boxplots of the variable raised to a ladder of powers
# (-3 to 3 in steps of 0.5); the most symmetric box suggests the best
# normalising transformation.
symbox(~f_obese, 
       lb_profile3, 
       na.rm=T,
       powers=seq(-3,3,by=.5))

symbox(~total_fat, 
       lb_profile3, 
       na.rm=T,
       powers=seq(-3,3,by=.5))

symbox(~total_sugar, 
       lb_profile3, 
       na.rm=T,
       powers=seq(-3,3,by=.5))

symbox(~total_saturate, 
       lb_profile3, 
       na.rm=T,
       powers=seq(-3,3,by=.5))

symbox(~total_carb, 
       lb_profile3, 
       na.rm=T,
       powers=seq(-3,3,by=.5))

symbox(~total_fibre, 
       lb_profile3, 
       na.rm=T,
       powers=seq(-3,3,by=.5))

symbox(~carb, 
       lb_profile3, 
       na.rm=T,
       powers=seq(-3,3,by=.5))

According to the box-plots, a power transform (^0.5) is a suitable transform for our data to approximate a normal distribution.

# Re-check the distributions after the square-root (power 0.5) transform
# suggested by the symbox plots above.
ggplot(lb_profile3, aes(x=f_obese^0.5)) + 
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(lb_profile3, aes(x=total_fat^0.5)) + 
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(lb_profile3, aes(x=total_sugar^0.5)) + 
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(lb_profile3, aes(x=total_saturate^0.5)) + 
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(lb_profile3, aes(x=total_carb^0.5)) + 
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(lb_profile3, aes(x=total_fibre^0.5)) + 
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Ols linear regression model

Using the variables we make the distribution transforming, and do the ols regression model, compare r-square, p-value, vif-value, coefficient and find the best model.

# Model 1: OLS on the untransformed total nutrient quantities.
model1 <- lb_profile3%>%
  lm(f_obese ~ total_fat+total_sugar+total_saturate+total_carb+total_fibre, data = .)

# Coefficient table.
tidy(model1)
## # A tibble: 6 × 5
##   term                estimate   std.error statistic  p.value
##   <chr>                  <dbl>       <dbl>     <dbl>    <dbl>
## 1 (Intercept)    21.3          1.36           15.6   4.90e-15
## 2 total_fat      -0.000000488  0.000000379    -1.29  2.08e- 1
## 3 total_sugar    -0.0000000294 0.000000214    -0.137 8.92e- 1
## 4 total_saturate -0.000000169  0.000000840    -0.201 8.42e- 1
## 5 total_carb      0.000000236  0.000000115     2.05  4.99e- 2
## 6 total_fibre     0.000000580  0.000000637     0.910 3.71e- 1
# Model-level fit statistics (r-squared, AIC, ...).
glance(model1)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.314         0.187  4.49      2.47  0.0572     5  -93.1  200.  211.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Variance inflation factors: values >> 10 flag severe multicollinearity.
vif(model1)
##      total_fat    total_sugar total_saturate     total_carb    total_fibre 
##      4150.0884      1597.1585      3197.4571      1581.4060       383.2046

The p-values are too high in model1, and the VIF values are very high as well.

# Model 2: OLS with the square-root transform applied to the response and
# to every total nutrient predictor (I() protects ^ inside a formula).
model2 <- lb_profile3%>%
  lm(I(f_obese^0.5) ~ I(total_fat^0.5)+ I(total_sugar^0.5) + I(total_saturate^0.5)+ I(total_carb^0.5)+I(total_fibre^0.5), data = .)

tidy(model2)
## # A tibble: 6 × 5
##   term                   estimate std.error statistic  p.value
##   <chr>                     <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)            4.75      0.254       18.7   5.50e-17
## 2 I(total_fat^0.5)      -0.00182   0.00121     -1.51  1.43e- 1
## 3 I(total_sugar^0.5)    -0.000367  0.000727    -0.504 6.18e- 1
## 4 I(total_saturate^0.5)  0.000251  0.00159      0.158 8.76e- 1
## 5 I(total_carb^0.5)      0.00125   0.000537     2.33  2.77e- 2
## 6 I(total_fibre^0.5)     0.000595  0.000971     0.613 5.45e- 1
glance(model2)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.413         0.305 0.467      3.81 0.00973     5  -18.4  50.7  61.2
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# VIFs remain extreme: all total_* variables share the common factor
# num_transactions, so they are strongly correlated with one another.
vif(model2)
##      I(total_fat^0.5)    I(total_sugar^0.5) I(total_saturate^0.5) 
##             6610.2402             2584.0167             4546.0260 
##     I(total_carb^0.5)    I(total_fibre^0.5) 
##             2655.7883              764.7656

The VIF values are too high in model2, but this is expected: every total_* variable is the nutrient average multiplied by num_transactions, so the predictors share a common factor and are strongly correlated with one another.

# Model 3: square-root response against the per-transaction average
# nutrient values (not totals), avoiding the shared num_transactions factor.
model3 <- lb_profile3%>%
  lm(f_obese^0.5 ~ fat+sugar+saturate+carb+fibre, data = .)

tidy(model3)
## # A tibble: 6 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)   13.2       5.06      2.60   0.0149
## 2 fat           -1.59      0.919    -1.73   0.0942
## 3 sugar          0.499     0.347     1.44   0.162 
## 4 saturate       0.465     1.65      0.282  0.780 
## 5 carb           0.162     0.212     0.765  0.451 
## 6 fibre         -2.47      2.74     -0.903  0.374
glance(model3)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.421         0.314 0.463      3.93 0.00830     5  -18.1  50.3  60.7
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# All VIFs are now below 10 — acceptable multicollinearity.
vif(model3)
##      fat    sugar saturate     carb    fibre 
## 7.009292 9.772034 3.599163 9.818239 2.286981

Model3 shows a lower p-value and a reasonable r-squared, and all VIF values are below 10, indicating low multicollinearity among the predictors.

Make the residuals distribution histograms to check whether they follow a normal distribution.

# augment() appends fitted values and residuals (.fitted, .resid, ...) to
# the original data for each model.
model_data1 <- model1 %>%
  augment(., lb_profile3)
model_data2 <- model2 %>%
  augment(., lb_profile3)
model_data3 <- model3 %>%
  augment(., lb_profile3)


#plot residuals
# NOTE(review): qplot() on a numeric vector already draws a histogram, so
# the extra `+ geom_histogram()` draws the layer twice — that is why the
# stat_bin message appears twice per plot. Visually identical, but the
# second layer is redundant.
model_data1%>%
dplyr::select(.resid)%>%
  pull()%>%
  qplot()+ 
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

model_data2%>%
dplyr::select(.resid)%>%
  pull()%>%
  qplot()+ 
  geom_histogram() 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

model_data3%>%
dplyr::select(.resid)%>%
  pull()%>%
  qplot()+ 
  geom_histogram() 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

They all appear to approximately follow a normal distribution.

Make the residuals plot, to check the homoscedasticity

# Standard lm diagnostic plots (residuals vs fitted, Q-Q, scale-location,
# leverage), arranged 2x2 for each model.
par(mfrow=c(2,2))    #plot to 2 by 2 array
plot(model1)

par(mfrow=c(2,2))    #plot to 2 by 2 array
plot(model2)

par(mfrow=c(2,2))    #plot to 2 by 2 array
plot(model3)

  • Therefore, comparing all the results above: the residuals plot of model3 shows a fitted line closest to horizontal, and its Q-Q plot is also close to linear — the best of the three models — indicating near-homoscedasticity.
  • The r-squared is 0.42, which means the model explains about 42% of the variance in the obesity rate, and the p-value is well below 0.05, so the null hypothesis is unlikely.
  • All VIFs are smaller than 10, which means multicollinearity has little influence.
  • Therefore I am choosing model3 for the OLS part.

Spatial regression

Now we need to focus on the influence of spatial auto-correlation of model3, and consider about doing the spatial regression to move the influence of spatial correlation

# Durbin-Watson test on the model3 residuals (values < 2 suggest positive
# autocorrelation).
DW <- durbinWatsonTest(model3)
tidy(DW)
## # A tibble: 1 × 5
##   statistic p.value autocorrelation method             alternative
##       <dbl>   <dbl>           <dbl> <chr>              <chr>      
## 1      1.17  0.0120           0.289 Durbin-Watson Test two.sided

The DW statistic (<2) shows slight positive spatial autocorrelation. Let’s check the residuals map of model3; first, add the residuals of models 2 and 3 to the data.

# Attach the residuals of models 2 and 3 to the spatial data for mapping.
lb_profile3 <- lb_profile3 %>%
  mutate(model3resids = residuals(model3)) %>%
  mutate(model2resids = residuals(model2))

Plot the residuals map of model3

# Map the model 2 and model 3 residuals: spatial clusters of same-signed
# residuals hint at residual spatial autocorrelation.
tmap_mode("plot")
## tmap mode set to plotting
tm_shape(lb_profile3) +
  tm_polygons("model2resids",
              palette = "RdYlBu")
## Variable(s) "model2resids" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

tmap_mode("plot")
## tmap mode set to plotting
tm_shape(lb_profile3) +
  tm_polygons("model3resids",
              palette = "RdYlBu")
## Variable(s) "model3resids" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

Same colors shows the spatial correlation roughly, but we need Moran’s I test to check more details.

Make the queen neighbors lists/ list.weight and k nearest neighbors lists/list.weight.

# Borough centroids (again) for the k-nearest-neighbour definition.
coordsW <- lb_profile3%>%
  st_centroid()%>%
  st_geometry()
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over
## geometries of x
# Queen contiguity neighbours (same definition as lb_nb earlier).
lb_qnb <- lb_profile3 %>%
  poly2nb(., queen=T)

#or nearest neighbours
lb_knn <-coordsW %>%
  knearneigh(., k=4) %>%
  knn2nb()

plot(lb_qnb, st_geometry(coordsW), col="red")

plot(lb_knn, st_geometry(coordsW), col="red")

# Row-standardised (style="W") weights for both neighbour definitions.
lb.queens_weight <- lb_qnb %>%
  nb2listw(., style="W")

lb.knn_4_weight <- lb_knn %>%
  nb2listw(., style="W")

Use Moran I’s test to check the spatial auto-correlation of the residuals of model3 more precisely.

# Moran's I on the model3 residuals under both weighting schemes; the
# geometry column must be dropped before pulling the residual vector.
Queen_model3 <- lb_profile3 %>%
  st_drop_geometry()%>%
  dplyr::select(model3resids)%>%
  pull()%>%
  moran.test(., lb.queens_weight)%>%
  tidy()

K_Nearest_neighbour_model3 <- lb_profile3 %>%
  st_drop_geometry()%>%
  dplyr::select(model3resids)%>%
  pull()%>%
  moran.test(., lb.knn_4_weight)%>%
  tidy()


Queen_model3
## # A tibble: 1 × 7
##   estimate1 estimate2 estimate3 statistic p.value method             alternative
##       <dbl>     <dbl>     <dbl>     <dbl>   <dbl> <chr>              <chr>      
## 1   -0.0485   -0.0312    0.0131    -0.151   0.560 Moran I test unde… greater
K_Nearest_neighbour_model3
## # A tibble: 1 × 7
##   estimate1 estimate2 estimate3 statistic p.value method             alternative
##       <dbl>     <dbl>     <dbl>     <dbl>   <dbl> <chr>              <chr>      
## 1    -0.132   -0.0312    0.0107    -0.976   0.835 Moran I test unde… greater

We can now confirm only weak negative spatial autocorrelation in the residuals. Next I use the Lagrange Multiplier (LM) test to choose between the spatial lag model and the spatial error model, using the same variables as model3. If the LM test does not give a suitable result, I will switch to a GWR model.

# Row-standardised queen weights for the LM tests.
# NOTE(review): this object is identical in construction to
# lb.queens_weight defined above — the duplication could be removed.
lb.queens_weight_ROW_model3 <- lb_qnb %>%
  nb2listw(., style="W")

# Lagrange Multiplier diagnostics: compare the spatial error (LMerr) and
# spatial lag (LMlag) alternatives, plus their robust forms and SARMA.
lm.LMtests(model3, lb.queens_weight_ROW_model3, test = c("LMerr","LMlag","RLMerr","RLMlag","SARMA"))
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = f_obese^0.5 ~ fat + sugar + saturate + carb +
## fibre, data = .)
## weights: lb.queens_weight_ROW_model3
## 
## LMerr = 0.15199, df = 1, p-value = 0.6966
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = f_obese^0.5 ~ fat + sugar + saturate + carb +
## fibre, data = .)
## weights: lb.queens_weight_ROW_model3
## 
## LMlag = 0.069299, df = 1, p-value = 0.7924
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = f_obese^0.5 ~ fat + sugar + saturate + carb +
## fibre, data = .)
## weights: lb.queens_weight_ROW_model3
## 
## RLMerr = 0.12682, df = 1, p-value = 0.7218
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = f_obese^0.5 ~ fat + sugar + saturate + carb +
## fibre, data = .)
## weights: lb.queens_weight_ROW_model3
## 
## RLMlag = 0.044129, df = 1, p-value = 0.8336
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = f_obese^0.5 ~ fat + sugar + saturate + carb +
## fibre, data = .)
## weights: lb.queens_weight_ROW_model3
## 
## SARMA = 0.19612, df = 2, p-value = 0.9066

Seems like everything’s p-value is high, so I would like to do a GWR model to see the coefficient of each nutrients in boroughs.

Geographically weighted regression model

Now making a Geographically Weighted Regression Model.

# Extract centroid coordinates as a plain X/Y matrix for spgwr.
coordsW2 <- st_coordinates(coordsW)

lb_profile4 <- cbind(lb_profile3,coordsW2)

# Attempted adaptive-bandwidth search with gwr.sel(), kept for reference.
# NOTE(review): this commented-out call repeats the `data = lb_profile4`
# argument on two lines — that duplication would error if uncommented.
# GWRbandwidth <- gwr.sel(f_obese^0.5 ~ fat+sugar+saturate+carb+fibre, 
#                  data = lb_profile4, 
#                   data = lb_profile4, 
#                         coords=cbind(lb_profile4$X, lb_profile4$Y),
#                   adapt=T)

# gwr.sel() errored while searching for the best bandwidth, so a bandwidth
# is picked manually below.
gwr.model = gwr(f_obese^0.5 ~ fat+sugar+saturate+carb+fibre, 
                  data = lb_profile4, 
                coords=cbind(lb_profile4$X, lb_profile4$Y), 
                adapt=0.09016994, # manually chosen adaptive bandwidth (~2 of 33 points)
                #matrix output
                hatmatrix=TRUE,
                #standard error
                se.fit=TRUE)

#print the results of the model
gwr.model
## Call:
## gwr(formula = f_obese^0.5 ~ fat + sugar + saturate + carb + fibre, 
##     data = lb_profile4, coords = cbind(lb_profile4$X, lb_profile4$Y), 
##     adapt = 0.09016994, hatmatrix = TRUE, se.fit = TRUE)
## Kernel function: gwr.Gauss 
## Adaptive quantile: 0.09016994 (about 2 of 33 data points)
## Summary of GWR coefficient estimates at data points:
##                   Min.   1st Qu.    Median   3rd Qu.      Max.  Global
## X.Intercept.  6.104625 12.111983 16.409629 21.155350 43.880196 13.1524
## fat          -5.645676 -2.653788 -2.038285 -1.442799 -0.824777 -1.5944
## sugar        -0.031909  0.448531  0.669031  0.945537  1.637667  0.4993
## saturate     -5.176539 -0.445703  0.177993  1.965549  5.703637  0.4648
## carb         -0.399558 -0.046645  0.080902  0.227878  0.898984  0.1624
## fibre        -9.290865 -3.577115 -1.935203 -0.492981  1.609352 -2.4737
## Number of data points: 33 
## Effective number of parameters (residual: 2traceS - traceS'S): 23.31585 
## Effective degrees of freedom (residual: 2traceS - traceS'S): 9.684152 
## Sigma (residual: 2traceS - traceS'S): 0.4821393 
## Effective number of parameters (model: traceS): 18.7869 
## Effective degrees of freedom (model: traceS): 14.2131 
## Sigma (model: traceS): 0.3979778 
## Sigma (ML): 0.2611839 
## AICc (GWR p. 61, eq 2.33; p. 96, eq. 4.21): 111.972 
## AIC (GWR p. 96, eq. 4.22): 23.82982 
## Residual sum of squares: 2.251161 
## Quasi-global R2: 0.7753594
# The SDF slot holds the per-borough local coefficients and standard errors.
results <- as.data.frame(gwr.model$SDF)
names(results)
##  [1] "sum.w"               "X.Intercept."        "fat"                
##  [4] "sugar"               "saturate"            "carb"               
##  [7] "fibre"               "X.Intercept._se"     "fat_se"             
## [10] "sugar_se"            "saturate_se"         "carb_se"            
## [13] "fibre_se"            "gwr.e"               "pred"               
## [16] "pred.se"             "localR2"             "X.Intercept._se_EDF"
## [19] "fat_se_EDF"          "sugar_se_EDF"        "saturate_se_EDF"    
## [22] "carb_se_EDF"         "fibre_se_EDF"        "pred.se.1"          
## [25] "coord.x"             "coord.y"

Plot the coefficient of these 5 variables calculated by gwr model in map.

# Attach the per-borough GWR coefficient estimates to the spatial data.
# NOTE(review): removed the stray trailing comma after coef_fibre.
lb_profile4 <- lb_profile4 %>%
  mutate(coef_fat = results$fat,
         coef_sugar = results$sugar,
         coef_saturate = results$saturate,
         coef_carb = results$carb,
         coef_fibre = results$fibre)

# Interactive maps of each local coefficient: sign and magnitude of the
# GWR relationship between the nutrient and obesity, per borough.
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(lb_profile4) +
  tm_polygons(col = "coef_fat", 
              palette = "RdBu", 
              alpha = 0.5)
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(lb_profile4) +
  tm_polygons(col = "coef_sugar", 
              palette = "RdBu", 
              alpha = 0.5)
## Variable(s) "coef_sugar" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(lb_profile4) +
  tm_polygons(col = "coef_saturate", 
              palette = "RdBu", 
              alpha = 0.5)
## Variable(s) "coef_saturate" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(lb_profile4) +
  tm_polygons(col = "coef_carb", 
              palette = "RdBu", 
              alpha = 0.5)
## Variable(s) "coef_carb" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(lb_profile4) +
  tm_polygons(col = "coef_fibre", 
              palette = "RdBu", 
              alpha = 0.5)
## Variable(s) "coef_fibre" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

We can see from the GWR maps how obesity is affected by several average nutrients in the food purchased in each borough. - Fat in food has a negative correlation with obesity in every borough. - Sugar in food has a positive correlation with obesity in nearly every borough. - Saturated fat in food has a positive correlation with obesity in the south, east and west of London, except the north. - Carbohydrate in food has a positive correlation with obesity in the south, east and west of London. - Fibre in food has a negative correlation with obesity in nearly every borough.

Plot the sigtest(like residuals) of variables(under 18) by map

# Unfinished significance test: |coefficient| - 2*SE > 0 would flag
# boroughs where a local coefficient is significant at roughly 95%.
# NOTE(review): if uncommented, the mutate chain below would error — the
# last mutate() line ends with a dangling %>% with nothing following it.
# sigTest1 = abs(gwr.model$SDF$"fat")-2 * gwr.model$SDF$"fat_se"
# sigTest2 = abs(gwr.model$SDF$"sugar")-2 * gwr.model$SDF$"sugar_se"
# sigTest3 = abs(gwr.model$SDF$"saturate")-2 * gwr.model$SDF$"saturate_se"
# sigTest4 = abs(gwr.model$SDF$"carb")-2 * gwr.model$SDF$"carb_se"
# sigTest5 = abs(gwr.model$SDF$"fibre")-2 * gwr.model$SDF$"fibre_se"
# lb_profile4 <- lb_profile4 %>%
#   mutate(fat_sig = sigTest1) %>%
#   mutate(sugar_sig = sigTest2) %>%
#   mutate(saturate_sig = sigTest3) %>%
#   mutate(carb_sig = sigTest4) %>%
#   mutate(fibre_sig = sigTest5) %>%
# 
# 
# tmap_mode("view")
# 
# tm_shape(lb_profile4) +
#   tm_polygons(col = "fat_sig",
#               palette = "RdYlBu")
# 
# tmap_mode("view")
# 
# tm_shape(lb_profile4) +
#   tm_polygons(col = "sugar_sig",
#               palette = "RdYlBu")

Constraints

No time to fix the sig test, and mention about the conclusion about spatial pattern. If sig test comes out, we can make sure the right coefficient with high significance.

Conclusion

Sugar, saturated fat and carbohydrate in food have a positive correlation with adult obesity in most London boroughs. In contrast, fibre and fat in food have a negative correlation with adult obesity. So the advice for government is to promote food containing high fibre and appropriate healthy fats.